{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "3538cf2a",
   "metadata": {
    "toc": true
   },
   "source": [
    "<h1>Table of Contents<span class=\"tocSkip\"></span></h1>\n",
    "<div class=\"toc\"><ul class=\"toc-item\"></ul></div>"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "67c1e6b1",
   "metadata": {},
   "source": [
    "\n",
    "\n",
    "下面的例子将展示词向量标准工具包——gensim提供的词嵌入，并展示词嵌入如何表示词的相似度。\n",
    "<!-- https://nlp.stanford.edu/projects/glove/ -->"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "5109452a",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-04-16T05:52:22.946636Z",
     "start_time": "2025-04-16T05:52:07.803372Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1.10.1\n",
      "4.3.0\n"
     ]
    }
   ],
   "source": [
    "import scipy\n",
    "import gensim\n",
    "# Print the installed SciPy and gensim versions, one per line.\n",
    "print(scipy.__version__, gensim.__version__, sep='\\n')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "5c5a740a",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-04-16T05:53:22.165565Z",
     "start_time": "2025-04-16T05:52:22.949712Z"
    }
   },
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pprint\n",
    "\n",
    "from gensim.models import KeyedVectors\n",
    "\n",
    "# Download the GloVe vectors from the official GloVe site (glove.6B.zip is used here),\n",
    "# unzip the archive, and point the path below at the extracted file.\n",
    "glove_path = 'glove.6B.100d.txt'\n",
    "# The file is read as plain text (binary=False) with no word2vec header line (no_header=True).\n",
    "model = KeyedVectors.load_word2vec_format(glove_path, binary=False, no_header=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "01a2e4a5",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-04-16T05:53:22.454576Z",
     "start_time": "2025-04-16T05:53:22.168620Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[('movie', 0.9055121541023254),\n",
      " ('films', 0.8914433717727661),\n",
      " ('directed', 0.8124362826347351),\n",
      " ('documentary', 0.8075793981552124),\n",
      " ('drama', 0.7929168939590454),\n",
      " ('movies', 0.7889865040779114),\n",
      " ('comedy', 0.7842751145362854),\n",
      " ('starring', 0.7573285102844238),\n",
      " ('cinema', 0.7419455647468567),\n",
      " ('hollywood', 0.7307389974594116)]\n",
      "[('vehicle', 0.8630837798118591),\n",
      " ('truck', 0.8597878813743591),\n",
      " ('cars', 0.837166965007782),\n",
      " ('driver', 0.8185911178588867),\n",
      " ('driving', 0.781263530254364),\n",
      " ('motorcycle', 0.7553156614303589),\n",
      " ('vehicles', 0.7462257146835327),\n",
      " ('parked', 0.74594646692276),\n",
      " ('bus', 0.737270712852478),\n",
      " ('taxi', 0.7155269384384155)]\n"
     ]
    }
   ],
   "source": [
    "# Use most_similar() to list the n vocabulary words nearest (most similar) to each query word.\n",
    "for query in ('film', 'car'):\n",
    "    pprint.pprint(model.most_similar(query))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "8b62f7ad",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-04-16T05:53:22.572240Z",
     "start_time": "2025-04-16T05:53:22.463092Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "japanese\n",
      "panda\n",
      "longest\n",
      "terrible\n",
      "queen\n"
     ]
    }
   ],
   "source": [
    "# Demonstrate a word analogy using the GloVe vectors.\n",
    "def analogy(x1, x2, y1):\n",
    "    \"\"\"Answer the analogy 'x1 is to x2 as y1 is to ?'.\n",
    "\n",
    "    Queries the notebook-level `model` with y1 and x2 as positive terms and\n",
    "    x1 as the negative term, and returns the word of the top-ranked result.\n",
    "    \"\"\"\n",
    "    # most_similar returns (word, similarity) pairs, best match first.\n",
    "    candidates = model.most_similar(positive=[y1, x2], negative=[x1])\n",
    "    best_word, _score = candidates[0]\n",
    "    return best_word\n",
    "\n",
    "# Each triple is (x1, x2, y1): find the word that is to y1 what x2 is to x1.\n",
    "analogy_queries = [\n",
    "    ('china', 'chinese', 'japan'),\n",
    "    ('australia', 'koala', 'china'),\n",
    "    ('tall', 'tallest', 'long'),\n",
    "    ('good', 'fantastic', 'bad'),\n",
    "    ('man', 'woman', 'king'),\n",
    "]\n",
    "for query_triple in analogy_queries:\n",
    "    print(analogy(*query_triple))"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "0c308cee",
   "metadata": {},
   "source": [
    "下面将展示word2vec的代码，包括文本预处理、skipgram算法的实现、以及使用PyTorch进行优化。这里使用《小王子》这本书作为训练语料。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "590fc408",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-04-16T05:53:24.142357Z",
     "start_time": "2025-04-16T05:53:22.577295Z"
    }
   },
   "outputs": [],
   "source": [
    "# 安装NLTK，使用如下代码下载punkt组件\n",
    "#import nltk\n",
    "#nltk.download('punkt')\n",
    "\n",
    "from nltk.tokenize import sent_tokenize, word_tokenize\n",
    "from collections import defaultdict\n",
    "\n",
    "# Dataset wrapper that owns the corpus: reading the text and preprocessing it.\n",
    "class TheLittlePrinceDataset:\n",
    "    \"\"\"Corpus loader and vocabulary builder for 'the little prince.txt'.\n",
    "\n",
    "    Typical use: construct the object, call build_vocab(), then call\n",
    "    convert_tokens_to_ids() and/or get_word_distribution().\n",
    "    \"\"\"\n",
    "\n",
    "    def __init__(self, tokenize=True):\n",
    "        \"\"\"Read the corpus file and optionally tokenize it.\n",
    "\n",
    "        Args:\n",
    "            tokenize: if True, lower-case the text and store the NLTK sentence\n",
    "                split in self.sentences and the per-sentence word tokens in\n",
    "                self.tokens; if False, only the raw text is kept in self.text.\n",
    "        \"\"\"\n",
    "        # The context manager closes the file handle as soon as it has been read.\n",
    "        with open('the little prince.txt', 'r', encoding='utf-8') as corpus_file:\n",
    "            text = corpus_file.read()\n",
    "        if tokenize:\n",
    "            # Sentence splitting and word tokenization are done with NLTK.\n",
    "            self.sentences = sent_tokenize(text.lower())\n",
    "            self.tokens = [word_tokenize(sent) for sent in self.sentences]\n",
    "        else:\n",
    "            self.text = text\n",
    "\n",
    "    def build_vocab(self, min_freq=1):\n",
    "        \"\"\"Count token frequencies and build the token <-> id mappings.\n",
    "\n",
    "        Args:\n",
    "            min_freq: frequency threshold; a token is kept only when its count\n",
    "                is strictly greater than this value.\n",
    "        \"\"\"\n",
    "        # Count how often each token occurs.\n",
    "        frequency = defaultdict(int)\n",
    "        for sentence in self.tokens:\n",
    "            for token in sentence:\n",
    "                frequency[token] += 1\n",
    "        self.frequency = frequency\n",
    "\n",
    "        # <unk> stands in for out-of-vocabulary tokens; <pad> is reserved for\n",
    "        # padding variable-length inputs so they can be batched.\n",
    "        self.token2id = {'<unk>': 1, '<pad>': 0}\n",
    "        self.id2token = {1: '<unk>', 0: '<pad>'}\n",
    "        for token, freq in sorted(frequency.items(), key=lambda x: -x[1]):\n",
    "            # Tokens are visited in descending frequency, so once one falls at or\n",
    "            # below the threshold every remaining token does too.\n",
    "            if freq > min_freq:\n",
    "                self.token2id[token] = len(self.token2id)\n",
    "                self.id2token[len(self.id2token)] = token\n",
    "            else:\n",
    "                break\n",
    "\n",
    "    def get_word_distribution(self):\n",
    "        \"\"\"Return the normalised unigram distribution over vocabulary ids.\n",
    "\n",
    "        Requires build_vocab() to have been called. Uses only this instance's\n",
    "        state (no notebook-level globals).\n",
    "\n",
    "        Returns:\n",
    "            1-D numpy array of length len(self.token2id) that sums to 1; entry i\n",
    "            is the relative frequency of the token with id i. Counts of tokens\n",
    "            dropped from the vocabulary are accumulated on the <unk> id.\n",
    "        \"\"\"\n",
    "        distribution = np.zeros(len(self.token2id))\n",
    "        unk_id = self.token2id['<unk>']\n",
    "        for token, freq in self.frequency.items():\n",
    "            if token in self.token2id:\n",
    "                distribution[self.token2id[token]] = freq\n",
    "            else:\n",
    "                # Out-of-vocabulary tokens are counted as <unk>.\n",
    "                distribution[unk_id] += freq\n",
    "        distribution /= distribution.sum()\n",
    "        return distribution\n",
    "\n",
    "    # Convert the tokenized sentences into their id representation.\n",
    "    def convert_tokens_to_ids(self, drop_single_word=True):\n",
    "        \"\"\"Map every sentence in self.tokens to a list of vocabulary ids.\n",
    "\n",
    "        Args:\n",
    "            drop_single_word: if True, skip sentences consisting of a single\n",
    "                token, since no loss can be computed for them.\n",
    "\n",
    "        Returns:\n",
    "            The list of id lists, which is also stored in self.token_ids.\n",
    "        \"\"\"\n",
    "        self.token_ids = []\n",
    "        for sentence in self.tokens:\n",
    "            # Unknown tokens map to id 1 (<unk>).\n",
    "            token_ids = [self.token2id.get(token, 1) for token in sentence]\n",
    "            if len(token_ids) == 1 and drop_single_word:\n",
    "                continue\n",
    "            self.token_ids.append(token_ids)\n",
    "\n",
    "        return self.token_ids\n",
    "\n",
    "# Load and tokenize the corpus, build the vocabulary, then encode every sentence as ids.\n",
    "dataset = TheLittlePrinceDataset(tokenize=True)\n",
    "dataset.build_vocab(min_freq=1)\n",
    "sentences = dataset.convert_tokens_to_ids(drop_single_word=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "efc882de",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-04-16T05:53:24.364181Z",
     "start_time": "2025-04-16T05:53:24.144979Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(76044, 2) [[  4  16]\n",
      " [  4  19]\n",
      " [ 16   4]\n",
      " ...\n",
      " [130   3]\n",
      " [  3  86]\n",
      " [  3 130]]\n"
     ]
    }
   ],
   "source": [
    "# Enumerate every (center word, context word) pair inside the context window.\n",
    "window_size = 2\n",
    "data = []\n",
    "\n",
    "for sentence in sentences:\n",
    "    sentence_len = len(sentence)\n",
    "    for i in range(sentence_len):\n",
    "        # Clamp the window to the sentence boundaries.\n",
    "        window_start = max(0, i - window_size)\n",
    "        window_end = min(sentence_len, i + window_size + 1)\n",
    "        for j in range(window_start, window_end):\n",
    "            # A word is not its own context.\n",
    "            if j == i:\n",
    "                continue\n",
    "            center_word = sentence[i]\n",
    "            context_word = sentence[j]\n",
    "            data.append([center_word, context_word])\n",
    "\n",
    "# numpy must be installed beforehand\n",
    "import numpy as np\n",
    "data = np.array(data)\n",
    "print(data.shape, data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "30903b3d",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-04-16T05:53:24.377759Z",
     "start_time": "2025-04-16T05:53:24.366231Z"
    }
   },
   "outputs": [],
   "source": [
    "# 需要提前安装PyTorch\n",
    "import torch\n",
    "from torch import nn\n",
    "import torch.nn.functional as F\n",
    "\n",
    "# Skip-gram model whose loss is computed contrastively via negative sampling.\n",
    "class SkipGramNCE(nn.Module):\n",
    "    \"\"\"Skip-gram word-embedding model trained with negative sampling.\n",
    "\n",
    "    Args:\n",
    "        vocab_size: number of rows in both embedding tables.\n",
    "        embed_size: dimensionality of each embedding vector.\n",
    "        distribution: per-id sampling weights; they are raised to the power\n",
    "            0.75 and renormalised before being used to draw negative words.\n",
    "        neg_samples: number of negative words drawn per training pair.\n",
    "    \"\"\"\n",
    "\n",
    "    def __init__(self, vocab_size, embed_size, distribution, neg_samples=20):\n",
    "        super(SkipGramNCE, self).__init__()\n",
    "        print(f'vocab_size = {vocab_size}, embed_size = {embed_size}, '\n",
    "              f'neg_samples = {neg_samples}')\n",
    "        # Separate tables for center words (input) and context words (output).\n",
    "        self.input_embeddings = nn.Embedding(vocab_size, embed_size)\n",
    "        self.output_embeddings = nn.Embedding(vocab_size, embed_size)\n",
    "        # np.power returns a new array, so the caller's array is not modified.\n",
    "        distribution = np.power(distribution, 0.75)\n",
    "        distribution /= distribution.sum()\n",
    "        # NOTE: kept as a plain attribute (not a registered buffer), so\n",
    "        # module.to(device) does not move it; forward() moves the sampled ids.\n",
    "        self.distribution = torch.tensor(distribution)\n",
    "        self.neg_samples = neg_samples\n",
    "\n",
    "    def forward(self, input_ids, labels):\n",
    "        \"\"\"Return the mean negative-sampling loss for a batch of word pairs.\n",
    "\n",
    "        Args:\n",
    "            input_ids: 1-D tensor of center-word ids, shape (batch,).\n",
    "            labels: 1-D tensor of context-word ids, shape (batch,).\n",
    "\n",
    "        Returns:\n",
    "            Scalar tensor: minus the batch mean of\n",
    "            logsigmoid(u_o . v_i) + sum_k logsigmoid(-u_k . v_i).\n",
    "        \"\"\"\n",
    "        i_embed = self.input_embeddings(input_ids)   # (batch, embed)\n",
    "        o_embed = self.output_embeddings(labels)     # (batch, embed)\n",
    "        batch_size = i_embed.size(0)\n",
    "        # Draw neg_samples negative ids per pair from the smoothed distribution.\n",
    "        n_words = torch.multinomial(\n",
    "            self.distribution, batch_size * self.neg_samples, replacement=True\n",
    "        ).view(batch_size, -1)\n",
    "        # Keep the sampled ids on the same device as the inputs.\n",
    "        n_words = n_words.to(input_ids.device)\n",
    "        n_embed = self.output_embeddings(n_words)    # (batch, neg, embed)\n",
    "        pos_term = F.logsigmoid(torch.sum(i_embed * o_embed, dim=1))\n",
    "        # Negative samples, used for the contrastive part of the loss.\n",
    "        # squeeze(2) removes only the trailing size-1 axis produced by bmm; a bare\n",
    "        # squeeze() would also drop the batch or sample axis when it has size 1\n",
    "        # and make the sum over dim=1 fail.\n",
    "        neg_scores = torch.bmm(n_embed, i_embed.unsqueeze(2)).squeeze(2)\n",
    "        neg_term = torch.sum(F.logsigmoid(-neg_scores), dim=1)\n",
    "        loss = -torch.mean(pos_term + neg_term)\n",
    "        return loss"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "1d9da6c8",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-04-16T06:03:24.339695Z",
     "start_time": "2025-04-16T05:53:24.380768Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[0.00000000e+00 5.43983724e-02 5.34295679e-02 ... 9.68804495e-05\n",
      " 9.68804495e-05 9.68804495e-05]\n",
      "vocab_size = 1078, embed_size = 128, neg_samples = 20\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "epoch-99, loss=3.0793: 100%|█| 100/100 [09:56<00:00,  5.96s/\n"
     ]
    },
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAjMAAAGwCAYAAABcnuQpAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjcuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/bCgiHAAAACXBIWXMAAA9hAAAPYQGoP6dpAABJmklEQVR4nO3deXhU5d0+8PtMJplskz1kIQsBgiwBWcUEJKgsAiLIa1VAxfatiqiIVEFK+xOrEl9qKSqWVlsRFwq1INKiQFCIICIhBAhhh0ACZAjZ90xm5vn9MZlDhgQSkjmzJPfnuuaSOedM5psDZu58n+c8RxJCCBARERG5KJWjCyAiIiJqD4YZIiIicmkMM0REROTSGGaIiIjIpTHMEBERkUtjmCEiIiKXxjBDRERELk3t6AKUZjKZcPnyZWi1WkiS5OhyiIiIqBWEEKioqEBkZCRUqpv3Xjp8mLl8+TKio6MdXQYRERG1QV5eHqKiom56TIcPM1qtFoD5ZPj5+Tm4GiIiImqN8vJyREdHy5/jN9Phw4xlaMnPz49hhoiIyMW0ZooIJwATERGRS2OYISIiIpfGMENEREQujWGGiIiIXBrDDBEREbk0hhkiIiJyaQwzRERE5NIYZoiIiMilMcwQERGRS2OYISIiIpfGMENEREQujWGGiIiIXBrDjA2ZTAK19UZHl0FERNSpMMzY0JOfpGPE29+jorbe0aUQERF1GgwzNmIyCew9U4iiKj1yi6sdXQ4REVGnwTBjI4VVdTCYBACgtt7k4GqIiIg6D4YZG9GV1cp/rjNw3gwREZG9MMzYSH7jMMPODBERkd04NMwsWbIEkiRZPcLDw+X9QggsWbIEkZGR8PLywujRo5Gdne3Aim+MnRkiIiLHcHhnpl+/fsjPz5cfWVlZ8r5ly5Zh+fLlWLlyJdLT0xEeHo6xY8eioqLCgRU3r3FnhnNmiIiI7Eft8ALUaqtujIUQAitWrMDixYsxbdo0AMCaNWsQFhaGtWvX4plnnmn269XV1aGurk5+Xl5erkzh19GV1VyrgZ0ZIiIiu3F4Z+b06dOIjIxEXFwcHn30UZw7dw4AkJOTA51Oh3HjxsnHajQaJCcnY+/evTf8eikpKfD395cf0dHRin8PADszREREjuLQMDN8+HB8+umn2LZtGz766CPodDokJSWhqKgIOp0OABAWFmb1mrCwMHlfcxYtWoSysjL5kZeXp+j3YKErbxxm2JkhIiKyF4cOM02YMEH+c//+/ZGYmIgePXpgzZo1uPPOOwEAkiRZvUYI0WRbYxqNBhqNRpmCb0AIYX01k4GdGSIiIntx+DBTYz4+Pujfvz9Onz4tz6O5vgtTUFDQpFvjaCXV9dA3CjDszBAREdmPU4WZuro6HD9+HBEREYiLi0N4eDhSU1Pl/Xq9HmlpaUhKSnJglU3lN5r8C7AzQ0REZE8OHWZ6+eWXMXnyZMTExKCgoABvvvkmysvLMWvWLEiShHnz5mHp0qWIj49HfHw8li5dCm9vb8yYMcORZTfReI0ZgJ0ZIiIie3JomLl48SKmT5+OwsJChIaG4s4778S+ffsQGxsLAFiwYAFqamowZ84clJSUYPjw4di+fTu0Wq0jy24i/7oww84MERGR/Tg0zKxbt+6m+yVJwpIlS7BkyRL7FNRGls6MSgJMgp0ZIiIie3KqOTOuytKZ6RroBYCdGSIiIntimLEBXbl5AnC3YB8A7MwQERHZE8OMDVg6M5Yww7tmExER2Q/DTDsJIeQ5M91CGsIM781ERERkNwwz7VRea0C13hxeugV7A+C9mYiIiOyJYaadLF2ZQG93+Hu5A2BnhoiIyJ4YZtrJsvpvuL8XPN3dALAzQ0REZE8MM+1k6cxE+HtCozafTnZmiIiI7Idhpp0sVzKF+3uyM0NEROQADDPtJHdm/Kw7M0IIR5ZFRETUaTDMtFN++bXOjKahM2MSQL2RYYaIiMgeGGbaSdcw
ATjC30vuzABALefNEBER2QXDTDs1njOjUasgSebtXAWYiIjIPhhm2qGyzoCKWgMAc5iRJEnuzvD+TERERPbBMNMOlsm/Wk81fDVqAIBGbZ43wztnExER2QfDTDs0XmPGwtOdnRkiIiJ7Yphph8ar/1qwM0NERGRfDDPt0HiNGQtLZ6aOnRkiIiK7YJhph8ZrzFiwM0NERGRfDDPtwDkzREREjscw0w6N15ixsHRmuGgeERGRfTDMtEPj1X8trs2Z4TATERGRPTDMtFGN3oiS6noA13Vm5DtnszNDRERkDwwzbaRrmPzr7eEGP0+1vP3anbPZmSEiIrIHhpk2urbGjPk2BhaecmeGYYaIiMgeGGbaqLkrmYDGnRkOMxEREdkDw0wbyVcy+XlZbWdnhoiIyL7ULR9Czenf1R9PJMZicEyg1Xb5rtnszBAREdkFw0wbjeoVilG9Qptst3RmeGk2ERGRfXCYycbYmSEiIrIvhhkbY2eGiIjIvhhmbExeAZidGSIiIrtgmLEx+a7Z7MwQERHZBcOMjcl3zWZnhoiIyC4YZmyMnRkiIiL7YpixMXZmiIiI7IthxsYsnRneNZuIiMg+GGZs7NrVTBxmIiIisgeGGRtjZ4aIiMi+GGZsTNOoMyOEcHA1REREHR/DjI1ZVgAWAtAbOdRERESkNIYZG7PcmwngvBkiIiJ7YJixMQ83FSTJ/GfOmyEiIlIew4yNSZIkd2e4cB4REZHyGGYUIN85mwvnERERKY5hRgGWzkwtOzNERESKY5hRADszRERE9sMwowB2ZoiIiOyHYUYB7MwQERHZD8OMAjzlWxqwM0NERKQ0hhkFXLulATszRERESmOYUYCGnRkiIiK7YZhRgKUzwxWAiYiIlMcwowDLnBnem4mIiEh5DDMKYGeGiIjIfhhmFMDODBERkf0wzCiAnRkiIiL7cZowk5KSAkmSMG/ePHmbEAJLlixBZGQkvLy8MHr0aGRnZzuuyFZiZ4aIiMh+nCLMpKen48MPP8SAAQOsti9btgzLly/HypUrkZ6ejvDwcIwdOxYVFRUOqrR1PNmZISIishuHh5nKykrMnDkTH330EQIDA+XtQgisWLECixcvxrRp05CQkIA1a9aguroaa9eudWDFLbPcm4mdGSIiIuU5PMw899xzmDRpEsaMGWO1PScnBzqdDuPGjZO3aTQaJCcnY+/evTf8enV1dSgvL7d62Jt8byZ2ZoiIiBSnduSbr1u3DgcPHkR6enqTfTqdDgAQFhZmtT0sLAwXLly44ddMSUnB66+/bttCb9G1CcDszBARESnNYZ2ZvLw8vPjii/j888/h6el5w+MkSbJ6LoRosq2xRYsWoaysTH7k5eXZrObWujYBmJ0ZIiIipTmsM5ORkYGCggIMGTJE3mY0GvHDDz9g5cqVOHnyJABzhyYiIkI+pqCgoEm3pjGNRgONRqNc4a3AzgwREZH9OKwzc++99yIrKwuHDh2SH0OHDsXMmTNx6NAhdO/eHeHh4UhNTZVfo9frkZaWhqSkJEeV3SrszBAREdmPwzozWq0WCQkJVtt8fHwQHBwsb583bx6WLl2K+Ph4xMfHY+nSpfD29saMGTMcUXKradx512wiIiJ7cegE4JYsWLAANTU1mDNnDkpKSjB8+HBs374dWq3W0aXd1LVLs9mZISIiUpokhBCOLkJJ5eXl8Pf3R1lZGfz8/OzynmcKKjFmeRr8vdxx+LVxLb+AiIiIrNzK57fD15npiNiZISIish+GGQV4Npoz08EbX0RERA7HMKMAy6XZAG9pQEREpDSGGQVYLs0GGGaIiIiUxjCjAHc3CZZFinl/JiIiImUxzChAkqRGC+exM0NERKQkhhmFeMq3NGBnhoiISEkMMwrRsDNDRERkFwwzCmFnhoiIyD4YZhRi6czw/kxERETKYphRiKUzw1WAiYiIlMUwoxB2ZoiIiOyDYUYhGnZmiIiI7IJhRiHszBAREdkHw4xCOGeGiIjI
PhhmFNL4ztlERESkHIYZhWjU7MwQERHZA8OMQtiZISIisg+GGYVYOjNcAZiIiEhZDDMKsXRmeG8mIiIiZTHMKESeM8PODBERkaIYZhTCzgwREZF9MMwohHNmiIiI7INhRiHszBAREdkHw4xCLCsAszNDRESkLIYZhVjuzcTODBERkbIYZhSiYWeGiIjILhhmFCLfNZu3MyAiIlIUw4xC5Ltm83YGREREimKYUYjcmeEwExERkaIYZhQid2Y4AZiIiEhRDDMKaXw1kxDCwdUQERF1XAwzCrF0ZgB2Z4iIiJTEMKMQywrAACcBExERKYlhRiFqlQSVZP5zHS/PJiIiUgzDjEIkSZK7M7XszBARESmGYUZB8p2z2ZkhIiJSDMOMguQ7Z7MzQ0REpBiGGQWxM0NERKQ8hhkFsTNDRESkPIYZBcmdGd7SgIiISDEMMwrSuF9bBZiIiIiUwTCjoGuXZrMzQ0REpBSGGQVxAjAREZHyGGYUxAnAREREymOYURA7M0RERMpjmFGQ5c7Z7MwQEREph2FGQRp1wwRgdmaIiIgUwzCjIHZmiIiIlMcwoyBLZ6aOnRkiIiLFMMwoiJ0ZIiIi5THMKEheNI+dGSIiIsUwzCjo2r2Z2JkhIiJSCsOMgrw81ACAqjqDgyshIiLquBhmFBTg5Q4AKKupd3AlREREHRfDjIICvBlmiIiIlMYwoyB/dmaIiIgUxzCjoAAvDwBAtd7ItWaIiIgU4tAws2rVKgwYMAB+fn7w8/NDYmIivv32W3m/EAJLlixBZGQkvLy8MHr0aGRnZzuw4luj9VRDksx/ZneGiIhIGQ4NM1FRUXj77bdx4MABHDhwAPfccw+mTJkiB5Zly5Zh+fLlWLlyJdLT0xEeHo6xY8eioqLCkWW3mkolwc/TPNRUzjBDRESkCIeGmcmTJ2PixIno1asXevXqhbfeegu+vr7Yt28fhBBYsWIFFi9ejGnTpiEhIQFr1qxBdXU11q5d68iyb4ll3kxpNcMMERGREpxmzozRaMS6detQVVWFxMRE5OTkQKfTYdy4cfIxGo0GycnJ2Lt37w2/Tl1dHcrLy60ejsQrmoiIiJTl8DCTlZUFX19faDQazJ49G1999RX69u0LnU4HAAgLC7M6PiwsTN7XnJSUFPj7+8uP6OhoRetvCTszREREynJ4mLnttttw6NAh7Nu3D88++yxmzZqFY8eOyfslywzaBkKIJtsaW7RoEcrKyuRHXl6eYrW3Bi/PJiIiUpba0QV4eHigZ8+eAIChQ4ciPT0d7777LhYuXAgA0Ol0iIiIkI8vKCho0q1pTKPRQKPRKFv0LbAMM5UyzBARESnC4Z2Z6wkhUFdXh7i4OISHhyM1NVXep9frkZaWhqSkJAdWeGvkzky13sGVEBERdUwO7cz89re/xYQJExAdHY2KigqsW7cOu3btwtatWyFJEubNm4elS5ciPj4e8fHxWLp0Kby9vTFjxgxHln1LLAvncZiJiIhIGQ4NM1euXMHjjz+O/Px8+Pv7Y8CAAdi6dSvGjh0LAFiwYAFqamowZ84clJSUYPjw4di+fTu0Wq0jy74l/hxmIiIiUpQkhBCOLkJJ5eXl8Pf3R1lZGfz8/Oz+/tuydXjmswwMignAV3NG2P39iYiIXNGtfH473ZyZjiZAnjPDzgwREZESGGYUxmEmIiIiZTHMKKzxBOAOPqJHRETkEAwzCrOsM2M0CVTWGRxcDRERUcfDMKMwT3c3eKjNp5mXZxMREdkew4wdBPD+TERERIphmLED3p+JiIhIOW0KM2vWrMGWLVvk5wsWLEBAQACSkpJw4cIFmxXXUVjmzTDMEBER2V6bwszSpUvh5eUFAPjpp5+wcuVKLFu2DCEhIXjppZdsWmBH4N9wRROHmYiIiGyvTbczyMvLk+90vWnTJjz00EN4+umnMWLECIwePdqW9XUIHGYiIiJSTps6M76+vigqKgIAbN++HWPGjAEAeHp6oqamxnbVdRAB
8sJ5vHM2ERGRrbWpMzN27Fj8+te/xqBBg3Dq1ClMmjQJAJCdnY1u3brZsr4OwZ+3NCAiIlJMmzozH3zwARITE3H16lVs2LABwcHBAICMjAxMnz7dpgV2BJwATEREpJw2dWYCAgKwcuXKJttff/31dhfUEflznRkiIiLFtKkzs3XrVuzZs0d+/sEHH2DgwIGYMWMGSkpKbFZcR8EJwERERMppU5h55ZVXUF5eDgDIysrCb37zG0ycOBHnzp3D/PnzbVpgRxDgfe1mk0RERGRbbRpmysnJQd++fQEAGzZswP3334+lS5fi4MGDmDhxok0L7AjYmSEiIlJOmzozHh4eqK6uBgDs2LED48aNAwAEBQXJHRu6xnJvpso6A+qNJgdXQ0RE1LG0qTMzcuRIzJ8/HyNGjMD+/fuxfv16AMCpU6cQFRVl0wI7Ar+GMAOYuzMhvhoHVkNERNSxtKkzs3LlSqjVavz73//GqlWr0LVrVwDAt99+i/vuu8+mBXYEbioJWk9zbuRQExERkW21qTMTExOD//73v022//nPf253QR1VgLc7KmoNvDybiIjIxtoUZgDAaDRi06ZNOH78OCRJQp8+fTBlyhS4ubnZsr4Ow9/LHXmoQTk7M0RERDbVpjBz5swZTJw4EZcuXcJtt90GIQROnTqF6OhobNmyBT169LB1nS4vwHLnbN6fiYiIyKbaNGdm7ty56NGjB/Ly8nDw4EFkZmYiNzcXcXFxmDt3rq1r7BC4CjAREZEy2tSZSUtLw759+xAUFCRvCw4Oxttvv40RI0bYrLiOxJ/3ZyIiIlJEmzozGo0GFRUVTbZXVlbCw8Oj3UV1RAHszBARESmiTWHm/vvvx9NPP42ff/4ZQggIIbBv3z7Mnj0bDzzwgK1r7BAsw0ycAExERGRbbQoz7733Hnr06IHExER4enrC09MTSUlJ6NmzJ1asWGHjEjuGgIZhplKGGSIiIptq05yZgIAAfP311zhz5gyOHz8OIQT69u2Lnj172rq+DuPaBGBezURERGRLrQ4zLd0Ne9euXfKfly9f3uaCOip/L945m4iISAmtDjOZmZmtOk6SpDYX05EF8GomIiIiRbQ6zOzcuVPJOjo8yzBTWU09hBAMfURERDbSpgnAdOssnZl6o0C13ujgaoiIiDoOhhk78XJ3g7ubuRvDK5qIiIhsh2HGTiRJujYJmAvnERER2QzDjB1dW2uGl2cTERHZCsOMHXEVYCIiIttjmLEj3p+JiIjI9hhm7Kjx5dlERERkGwwzduTP+zMRERHZHMOMHQU0XM3EYSYiIiLbYZixI38v84LLnABMRERkOwwzdhTg3dCZ4aXZRERENsMwY0ecAExERGR7DDN2JE8A5pwZIiIim2GYsSPLOjO8nQEREZHtMMzYkWWYqaLOAIPR5OBqiIiIOgaGGTuyhBkAKK81OLASIiKijoNhxo7Ubir4asyXZ3MSMBERkW0wzNiZv3x/Jl6eTUREZAsMM3YW6GMOM0WVDDNERES2wDBjZ1EB3gCAiyXVDq6EiIioY2CYsbOYYHOYyS2ucXAlREREHQPDjJ1FB3oBAHKL2ZkhIiKyBYYZO4sO4jATERGRLTHM2FlMkGWYqRpCCAdXQ0RE5PoYZuysa6AXJAmo1htRVMUrmoiIiNqLYcbONGo3hPt5AuC8GSIiIltwaJhJSUnBsGHDoNVq0aVLF0ydOhUnT560OkYIgSVLliAyMhJeXl4YPXo0srOzHVSxbVjmzeQxzBAREbWbQ8NMWloannvuOezbtw+pqakwGAwYN24cqqqq5GOWLVuG5cuXY+XKlUhPT0d4eDjGjh2LiooKB1bePjEMM0RERDajduSbb9261er56tWr0aVLF2RkZGDUqFEQQmDFihVYvHgxpk2bBgBYs2YNwsLCsHbtWjzzzDOOKLvdGk8CJiIiovZxqjkzZWVlAICgoCAAQE5ODnQ6HcaNGycfo9FokJycjL179zb7Nerq6lBeXm71
cDYMM0RERLbjNGFGCIH58+dj5MiRSEhIAADodDoAQFhYmNWxYWFh8r7rpaSkwN/fX35ER0crW3gbRAeZF87L4yrARERE7eY0Yeb555/HkSNH8M9//rPJPkmSrJ4LIZpss1i0aBHKysrkR15eniL1todlAvDlshroDSYHV0NEROTaHDpnxuKFF17A5s2b8cMPPyAqKkreHh4eDsDcoYmIiJC3FxQUNOnWWGg0Gmg0GmULbqdQXw083VWorTfhcmkNuoX4OLokIiIil+XQzowQAs8//zw2btyI77//HnFxcVb74+LiEB4ejtTUVHmbXq9HWloakpKS7F2uzUiSxHkzRERENuLQzsxzzz2HtWvX4uuvv4ZWq5Xnwfj7+8PLywuSJGHevHlYunQp4uPjER8fj6VLl8Lb2xszZsxwZOntFhPkjVNXKhlmiIiI2smhYWbVqlUAgNGjR1ttX716NZ588kkAwIIFC1BTU4M5c+agpKQEw4cPx/bt26HVau1crW1FBXKtGSIiIltwaJhpzY0WJUnCkiVLsGTJEuULsiMOMxEREdmG01zN1NnIqwCXMMwQERG1B8OMg8QEN3RmihhmiIiI2oNhxkGiAs0L55XXGlBWXe/gaoiIiFwXw4yDeHuoEeJrXg+H82aIiIjajmHGgWIabmvAMENERNR2DDMOxEnARERE7ccw40C8PJuIiKj9GGYcKCqIC+cRERG1F8OMA7EzQ0RE1H4MMw5kCTOXSmpgNLW8GjIRERE1xTDjQGF+nvBwU8FgEsgvq3F0OURERC6JYcaB3FSSvHgeh5qIiIjahmHGwTgJmIiIqH0YZhyMC+cRERG1D8OMg127oolzZoiIiNqCYcbBLGHmQlGVgyshIiJyTQwzDpbQ1R8AcPRSGQor6xxcDRERkethmHGwqEBvDIjyh0kA27J1ji6HiIjI5TDMOIGJ/SMAAN9k5Tu4EiIiItfDMOMEJiaYw8xPZ4tQxKEmIiKiW8Iw4wRigr2R0NUPJgFsP3bF0eUQERG5FIYZJzEhgUNNREREbcEw4yQmNcyb2Xu2CMVVegdXQ0RE5DoYZpxEtxAf9I3wg9EkkHqMVzURERG1FsOME5nYPxwAsCWLYYaIiKi1GGaciOUS7b1nClFazaEmIiKi1mCYcSLdQ33RO1wLg0nwqiYiIqJWYphxMlxAj4iI6NYwzDgZS5j58UwhyqrrHVwNERGR82OYcTI9u/iiV5gv6o0CP5y+6uhyiIiInB7DjBPqE+EHAMgvq3FwJURERM6PYcYJBXp7AABKOMxERETUIoYZJ2QJM7w8m4iIqGUMM04oyMcdAHhbAyIiolZgmHFCARxmIiIiajWGGScU5NMQZtiZISIiahHDjBMK8DYPM7EzQ0RE1DKGGSdk6cyUVushhHBwNURERM6NYcYJWa5mMpgEKuoMzR6TdbEMsz/LQE5hlT1LIyIicjoMM07I090NXu5uAG48b+bzfRewNVuHDRkX7VkaERGR02GYcVKBLcybKaiotfovERFRZ8Uw46QCW7iiqbDSvP1qRZ3daiIiInJGDDNO6totDW4UZswh5molwwwREXVuDDNOytKZaW4VYCEEitiZISIiAsAw47Qsc2ZKm5kzU15jgN5oAmAebjKZePk2ERF1XgwzTupmw0yNh5aMJnHDoSgiIqLOgGHGSV27mqlpUCm8bp6MZTIwERFRZ8Qw46SuXc3UdJjp+nkynDdDRESdGcOMk7rZMNP1nZmrlVxrhoiIOi+GGSd1S2GGnRkiIurEGGacVKBPw5yZqvomN5ssrDAHHEkyP2eYISKizoxhxklZOjN6ownVeqPVPktnJi7YBwDDDBERdW4MM07K28MNHmrzX8/1C+dZwkyfCD8AXAWYiIg6N4YZJyVJ0g0XzrNcit0nQguAnRkiIurcGGacmGWoqbjRJGAhhBxeLJ0ZrjNDRESdGcOME7OEmdJGYaa89tqtDCxhprhKj/qGbURERJ0Nw4wTC2rm
ZpOW+TK+GjXC/TyhVpkvaSpid4aIiDophhknFiDf0uDanJnChiGmUK0GKpWEEF8NAM6bISKizsuhYeaHH37A5MmTERkZCUmSsGnTJqv9QggsWbIEkZGR8PLywujRo5Gdne2YYh3A0plpPMxkmR8T4mveF6I1/5erABMRUWfl0DBTVVWF22+/HStXrmx2/7Jly7B8+XKsXLkS6enpCA8Px9ixY1FRUWHnSh0jwPvGw0yWjkwoOzNERNTJqR355hMmTMCECROa3SeEwIoVK7B48WJMmzYNALBmzRqEhYVh7dq1eOaZZ+xZqkME+TS9NLtJmNEyzBARUefmtHNmcnJyoNPpMG7cOHmbRqNBcnIy9u7de8PX1dXVoby83OrhqprrzFhCC8MMERGRmdOGGZ1OBwAICwuz2h4WFibva05KSgr8/f3lR3R0tKJ1KimomUuz5c5Mw1wZyzAT15ohIqLOymnDjIVkuZtiAyFEk22NLVq0CGVlZfIjLy9P6RIV09yieVcbQkuo3JnxNG9nZ4aIiDoph86ZuZnw8HAA5g5NRESEvL2goKBJt6YxjUYDjUajeH32YLlzdm29CTV6I7w83ORLs0MahpcsVzXx/kxERNRZOW1nJi4uDuHh4UhNTZW36fV6pKWlISkpyYGV2Y+vRi0vildSrYcQQh5mCuWcGSIiIgAO7sxUVlbizJkz8vOcnBwcOnQIQUFBiImJwbx587B06VLEx8cjPj4eS5cuhbe3N2bMmOHAqu1HkiQEeHugsLIOJdV6aD3VqDOYb1tw/QTgyjoDqvUGeHs4bbONiIhIEQ795Dtw4ADuvvtu+fn8+fMBALNmzcInn3yCBQsWoKamBnPmzEFJSQmGDx+O7du3Q6vVOqpkuwvycTeHmap6eLmbuy8+Hm7w8nADYO7eeLqrUFtvQmGFHjHBDDNERNS5OPSTb/To0RBC3HC/JElYsmQJlixZYr+inIzl8uySaj081OZRQct8GcB8jkK1GuQV1+BqZS1igr0dUicREZGjOO2cGTILahRmrp8vY8FVgImIqDPjmISTs1zRVFJ1bRXgkOvDjGUSMNeaISKiTohhxskFNurMGE0Nk38bFsyz4J2ziYioM2OYcXKNw8z1VzJZ8PJsIiLqzBhmnFygjyXM1KNabwTAMENERNQYw4yTC/S2zJnRw61hAb1Q7Q0mAHMVYCIi6oQYZpzctc7Mtcm9N+rMFLIzQ0REnRDDjJOT58xU6WFsWJOnyaXZjYaZWroRJxERUUfDdWacnGWdmSq9EbX1N7+aSW80obzGYN8CiYiIHIxhxslpPdVQNWq0eHu4Nbn/kqe7G7Se5m2cN0NERJ0Nw4yTU6kkeagJaDpfxoJXNBERUWfFMOMCAhquaAKaXskkb2/miqbaeiO+PnQJZTX1zb6GiIioI2CYcQFBPo07Mx7NHtNcZ+a3G7Pw4rpD+N2mo8oWSERE5EAMMy4goA3DTFuO5GNj5qWGP19GXnG1wlUSERE5BsOMCwi6xTCjK6vFb7/KAmCeMGwSwOofzyteJxERkSMwzLiAAJ9rc2ZCbjBnxhJyCipq8cq/D6Osph79u/rjvUcHAQDWp+dy7gwREXVIDDMuoPHVTKEtzJnZe7YIu08XwtNdhT8/MhD39umCXmG+qNIbsW5/rl3qJSIisieGGRfQeJippauZjCbzKsG/ndgHPbv4QpIk/HpkdwDAJ3vPo95oUrhaIiIi+2KYcQGNL82+0ZyZLo1CzqheoXj8zlj5+ZRBkQjx1SC/rBZbjuQrVygREZEDMMy4AOtLs5sPM8G+GsQGeyPMT4M/PjTA6v5MGrUbZiWaw81Hu89BNNzjiYiIqCNgmHEBlqElrUYNH03z9wZ1U0nYNm8Uvv/NaIT5eTbZ/9idsfB0VyH7cjl+OlekaL1ERET2xLtmu4DYYB+8Mv42RAd53/Q4T3e3G+4L9PHAQ0Oi8Pm+XKz8/gxUkgRPdzd4uqvg7a5GZIAn1G7MtkRE
5Hok0cHHHMrLy+Hv74+ysjL4+fk5uhyHOne1EvcuT0Nzf+M+Hm4YHBuIO7oFYVhcEAZGB9w0HBERESnpVj6/2ZnpRLqH+uLlcbfhm6x81NYbUVtvQp3BiIpaA6r0Ruw+XYjdpwsBAP5e7vjvCyNb7AYRERE5GjszBJNJ4OSVCuzPKcb+88XYc7oQZTX1mD+2F+beG+/o8oiIqBO6lc9vTpIgqFQS+kT4YVZSN3wwYzAWT+oDAPgmi5dxExGR82OYoSbG9Q2DWiXhhK4CZ69WOrocIiKim2KYoSYCvD0womcIAOBbdmeIiMjJMcxQsyb2DwcAbMnSKfo+ZwoqMevj/ci4UKzo+xARUcfFMEPNGtc3HG4qCcfzy5FTWKXY+yzbegJpp67izS3HFXsPIiLq2BhmqFmBPh5I6hEMQLmJwJdLa7Dj+BUAQGZuKU7qKhR5HyIi6tgYZuiGJvaPAAB8e/TWw8yV8lqkHruCz/ddQLXe0Owxa3/OhanRwgDr0nPbVKcjrNufi2VbT6DOYHR0KUREnR4XzaMbGtc3DL/bdBRHL5Ujt6gaMcHNL6BnMgkcyy/HrpMFOJhbiqxLZbhaUSfvP5xXij/+4nar1+gNJjm8PDw0Cv86cBFfZV7Cwvt6O/3Kw6nHruDVjVkAgPyyWix/+HarG3sSEZF9sTNDNxTsq8Gd3YMAAN9c152prTfi26x8LPj3YQxP+Q73v78H72w/he9PFOBqRR1UEtArzBcA8GXGRWRcKLF6/dZsHQor9eii1eCNqQmI9PdEaXU9tmU3P+G4tt45OiAFFbVYuOGI/PyrzEtYnnrKgRURERHDDN3UhATzUFPjeTP7c4ox4d3dePaLg/jXgYu4WlEHbw83jOsbhtcm98WGZxOR/fp92P5SMn4xJAoA8NrmozA2GlP6/KcLAIDpd8RAo3bDL4ZGAwDW7c9rUsO2bB0GvL4dM/++D1fKaxX7XlsihMDCfx9BcZUefSL88ObUBADA+9+fwb/Sm9ZNRET2wWEmuqnx/cLx/74+iiMXy3BSV4Evfr6ATxuCSKhWg8kDInFP7y4YFhcIjbrp8NDCCb2xNVuHo5fKsXZ/Lh6/MxYndOXYf74YbioJM4bHAAAeHhaN974/jZ/OFeF8YRW6hfgAMN8c8zf/Ogy9wYQfzxRhwru78c4vBuCe3mGKfc+px66gpt6ICQnhcG90J/HP9l3AzpNXoVGr8N6jAxEfpsWV8lq8//0ZLPoqC2H+nkjuFQohBPLLanE4rxRVeiNGxYegi5+nYvUSEXV2DDN0U6FaDe6IC8K+c8WY/P4e6I0mAMD0O6KxaGIf+Hm63/T1Ib4a/GZsLyz5zzG8s+0kJvWPwGcNYWh8vzCENXzIdw3wwqj4UKSduor1B/Kw8L7eqNEbMeeLg6isM2BwTADqDCZkXy7Hrz45gF+NiMPCCbc1G6Bu5nJpDXw0avh7NV/3ql1n8X9bTwAAogK98NzdPfE/g6NwoagKbzVcPr5oQm/Eh2kBAPPH9sLFkhp8lXkJcz7PwJ3dg3H4YhkKK6/NGZIkYHBMIO7rF47x/cJvOPeIyJ7OXq3EK18exqheoXjx3njO+yKXxhtNUos+++k8fv91NgAgOsgLb08bIK8Q3BoGown3v78HJ3QVeOD2SOw4fgXVeiPWPjUcST2ufZ2tR/Mx+/ODCNVqsPfVe7Dw30ewMfMSQnw1+GbuSPh7u+Ptb09g9Y/nAQAxQd6IDfaGt4cbfDzU8NGoMb5fOEbGN1/bvzMu4tUNR+Dt4YY3H+yPB26PtNr/3nen5fkvWk81KmrNV2FF+nvC090N5wqrkNwrFJ/8cpjVD369wYQnPv4Z+85dW/hPrZJwW7gWajcVDueVWr3P8LggzBvTC4kNl743VlVnwM6TBfD2cMOIniG3HNZciRACOYVV8PNyR4ivxtHldCoVtfWY8sGPOHfVvIbUK+Nvw3N393RwVcoymQRUKtcPbPVGEyQAareOP0vkVj6/
GWaoRZV1Bry64Qi6Bnph7j3x8NHcekMv/XwxfvHXn+TnPbv4IvWlUU1CQdLb36GwUo/7+oVja7YObioJX/x6OO7sfu2D/7vjV/Dyl4dRUl3f7Hs9dVccXhnfGx5q8//sQgj8ZddZ/HHbSavjpgyMxB8eSICflxrLU0/h/e/PADD/YP/ViDis3Z+Lv6WdRUHDlVlBPh7YOu8udNE2HTIqq6nH33efQ6C3B26PDkC/SD/5qixdWS22H9Nh61Edfs4plucOWULNnd2DsD+nGF9mXMQ3Wfmo1psnO2s1aoztG4ZJAyIwMj4ENXojLpbU4GJJDS6X1qBHF18k9wpt9hwUVNTihbWZuFhSg2mDu+KRYdGICrTuCJ29WomtR3WorDNg5vCYJvtbIoRAXnENDuaWIDO3BEVVevxyRDcMiQ1q9niTSSAjtwTp54uRcb4EB3NLUFJdD7VKwuJJffBkUrdmuwPnC6twpbwWw7s3DX8WRZV1+DmnGF20GnQN9EIXrSfcOsAHFwD5/mg9Qn1t8vVMJoHZn2dg+7Er0GrUqKgzh/a3p/XHo3fE2OQ9bEEIgV2nrmLjwUuI9PfEw8Oi23wOTl+pwFOfHoCHWoU3p/bHHXHN/xttTU0ndBU4daUCapUKbioJapUED7UKQ7sFwtvD9oMdecXVyMwrxaHcUhzKK8HRy+UI9dVg3dN3IjrIdl3e8tp6nMivQHSQFyL8vWz2dduDYaYRhhnnMX/9IWzMvAQAeP2BfpiV1K3JMSnfHMfffjgnP391Qm/MTu7R5LiSKj32ny9Gtd6AqjojqvUGnNRVYsPBiwCA26MDsHL6IEQGeOG1zUfx+T7zZeDPJHeHRu2GD3aegdEkEOHviaQeIfLrFk/sg6dGdZffp7beiH8dyMO2bB2eu7unVSepLS6X1mDVrrNYn54nD9kFertbBbNuwd6oqTfiSvm1oSqVBKs1eSz+Z3AU/jCln1XAPKmrwK8+Scel0hp5myQByb1CMW1wFM4WmEPMySvXFin0UKvwvyPj8OzoHi0OHZZU6fHmluNIO1WAwkq91T43lYT5Y3vh2eQeVr8Fn9RVYPFXWThw3VVtapUEQ8M3Nm1QV7z1YH94eZhDYGWdAe99dxof78mBwSTwzi9ux0MNE8obq603YsrKH62+H7VKQkSAJ4J8NPDVmDt3vhpz98674bm3hxu8PdQwmEwortKjpEqPkup6VOuNGB4XhEkDIhAZ4Ngf6j+dLcITH/+MeqPAhIRwzBvTC7eFa62OKa3WY8fxApTV1KNfpB8SuvrD9ya/cHyw8wz+uO0kPNxU+NfsRKQe0+GDnWehkoC/PjYE4/qFy8dW6w346WwRTML87zI6yLtdSycIIVoczjKZBLYfu4KVO0/j6KVyq33DugXikWExmNg/vNXB4VBeKZ5cvR+ljf4feyIxFgvu633T89TY+cIqbD58GZsPX8aZguZvvhsT5I2PnxyGnl1aDlx6gwnfn7iCrEtlGBwTiBE9Q6zOa229Ed9k5eOzfReQmVva7NfoEeqDDc8mIcDbo8k+g9GEOoPppr94XiqtwXfHr+BQXikO55XibEOXTiWZ50r+ckQchnULbPHvq1pvwHvfncHce3vaPMwxzDTCMOM8Cipqcd+K3VBJwM6XR0PbzIfmuauVuOdPaQDM69z87fEhtzSWvy1bh1e+PIzyWgO0nmr0i/TDvnPFkCTgtfv74skRcQCAg7klmL/+EM4XVcuvXTL52n6lXR9qfDzccP+ASPxiaBSGxAZCCCAjtwRbjuTj26P5crAJ8dUgKtALwT4e2HmyACZh/qG2csZg9Inww66TBXh+bSYq6wyIC/HB7OTu2Hz4Mn48U9SkBnc3CSN6mjs+P+eYh8iCfTwwb0w8Hr0jxmrys8Wxy+V45vMDyCuukb9G30h/DI4JQEFFHbYcMV/1NrJnCJY/cju0Gne8+91p/H33ORhMAt4ebhh9WyiGxAZhSGwg+kb44dOfziPl2xMw
mgT6Rvjhb48PQWZeKd7acswq0Hl7uOG/L4xE9+t+O1/8VRa++DkXWo0aAT7uyC+tlQNSe93RLQiTB0YiOT4Uwb4e8PZwu+m/x2q9ATtPXMU3R/Ox90wh4rto8Uxyd9zTu8stz0k5d7USD/5lL8pqrn0IS5J5Mcv/HRmHk7oKfJOVj5/OFll9v5IEdA/xwYCoAIzsGYJ7endBoI/5A2/XyQL88pN0CHGtEyOEwMINR/CvAxehUavw18eHoLymHt9m6bDrVAFq601WdYX7eSI+zBfzx/bCoJjAFr8Pg9GErzIvYdWus7hYWoPuIT7oEeqLHqE+6BbiA5MwD69WNjy+O34Fp66YA4O3hxseHhqNiyXV+P5EgRzo/TzVzQ4VX+/HM4V46tMDqNYbcXt0AHqHabH+gPnKw64BXnjrwQSMvq1Ls6+trTdi86HL+GJ/rtVQsYdahYFRAZAkwGgSMJgELpZUo7BSDz9PNVY9NuSGw/DnrlZifXoeNhy8aPWLgKe7quHvKgy5xdX414E8FFeZ96tVEvpG+mFgdAAGRgcgNtgHz689iPyyWgyNDcTnvx5uFYQO5ZVi/r8OIa+4GuP7hWPGHTFI7BEs//s7crEUH+3OwTdZ+VZXmAJAF61G7kQDQL9IP/xyRBymDoxsdlhLV1aLX3+ajqOXyjH59ki8P33Qzf46bhnDTCMMM86lsLIOEsxr2NzIW1uO4XRBJd59dNANJ+rezMWSasz9ZyYONvxG46FW4d1HBmJCw4rGFlV1Brz1zXFsOZKPhff1lq+ssiddWS1OF1RgSOyNW9Qmk0B+eS2CfTysfmjtO1eEF9dl4kp5HTzUKjw4sCu+zMiDSZiHsP72+BD5t7acwiqs25+LHcevIC7EFxMSwjGmTxj8vd0hhMCO4wVI+fa4PIeia4AXfjmiGx4eFi13ajYfvowF/z6M2noTYoK8kTKtP4bEBso1CSHwZcZFvPZ1NmrqjQjxNdd7scQcfMb2DcOSB/qhazPdjr1nC/HC2kwUVenh7iah3mj+sRQb7I3/d39ffLT7HPadK0ZCVz9sfHaEPIRomWcFAJ/97x24Kz4URpPAlfJaXCqtQWl1vfxBWdXwqNYbUaU3okZvQJXeCLVKQqCPBwK93RHYcL62ZeuQfr6kSZ0eapV8nJ+XO/w83eHnZZ5Qnl9a2+yHPwDcFqbF7NHdcf+AyGZD4vVKqvSYtmovcgqrMDA6AH+Y0g9/TTuLb25w49c+EX6IDvRC9uVyq44cYO6W3dEtCMm3hWLVrrMoq6nH9DtikDKtv3yMwWjC7M8PyrcXaSwq0Av+Xu7ILaqWh6QA84fsb8bdhmdGdW92LoolxKzceQYXGv3S0BpajRpPjuiGX46IQ1BDENOV1WLDwYtYn56H3GLz15t+RzT+3/395G5eY1uP6jD3n5nQG00Y0TMYf3t8KHw1avx4phCvbjwiB/LuoT4YFR+K5F6hGN49CCXV9fjspwtYl54rd3NUEjCiZwimDOyKcf3CmnQviyrr8PRnGci4UAK1SsKbUxPkIbsLRVVIPXalyb+pUK0GI3oEI/18SZO/M8A8V2/mnbF4eGg0QrXWPy9PXanA/6zai4paA+7rF44PZg6GEAIf7DyL974/3SSkxIX44IHbI7HvXJH8iwtgDuuJPYIxMDoAA6L8EeyrwUldBT7Zm4ONBy+hzmD+t5zQ1Q8pDw5A/yh/+bVHL5Xh12sOQNfws+nDJ4ZiSGzL4fZWMMw0wjDTOdUbTXh3x2n8cPoqfjep703HyFvT+nZWxVV6vPzlYXx/okDe9tCQKCx9sL/8gd9a9UYT/rk/F+/uOI2iht8KfTVqPNywBtDHP+YAAEb1CsV7jw5str0NAGcKKvDcF5nysE+kvyeWPNDPaviiOZdLazD78wwcuVgGjVqF5+7uiadHdYenuxt0ZbW4790fUFpdj6fuisPiSX1xqbQGE1b8gPJaA2Yn98CrE3rf0vfbksulNdhyJB//OXIZ
J/Ir5GHBlsQEeWNi/wgk9wrFrlMF+GJfLiobQkCgtzv8vdzh7qaC2k0FDzfzb92PDovBgCh/SJIEvcGEx//xM37OKUbXAC9sem6E/GF27HI53v3uFHYcL0CfCC0mJERgYv8IxDUsZQCYf2HIulSGgxdKkHrsCk5cd8+zgdEBWP/MnU0ml9fWG/HEx/uxP6cY3UN8MKF/OCYkRKBfpB8kSYIQAiXV9ThfVIWP9+Tgv9d14bpoPWEyCRy+WIqdJ6/i60OX5BAT7OOBZ5K7Y0yfMFworsbZgkqcvVqFC0VVULuprg0FeqoRFeiNh4ZE3fAXGYPRhHe/O42VO89ACHNQXDljEOLDtCioqMXeM0XYfboQX2VehEkA9/ULx7vTB1p9v9V6A97Zdgqf/nTeqqvl4aaCwWSSO0BdA7zweGIs/mdwVJNAcb3aeiMWbjiCrw9dBmB+37NXK3G60bCUSgJG39YFjw6Lxt29u8DdTQUhBI7nV+C741eQduoqtJ5qTL8jBvf07nLTSb77zhXhiX/sh95owkNDonCmoBKHGjpIk2+PxBOJsfj60CVsyrws//sDzCH0gdsj8b93xaFfpP8Nvro5UP8zPRd/3XUW5bUGqCRgVlI3/Gbcbdh3tghz12WiWm9EfBdffPzkMJvO37FgmGmEYYY6OpNJ4OMfc7D6x/N4PDEWz4zq3q5wVltvxFeZl/CPPTlN5gc8O7oHXh53W4uTa2vrjfhg5xlIkoRnRnVv9aTx2nojtmXrMDgmsMkPx9RjV/DUpwcAAP+YNRSrdp3FgQsluD06AP+endiqjkdbCSFQrTeipFqP0up6FFfpUV5bj/IaA8pr61FWUw9PtRvG9O2CvhF+Vue/rKYen++7gNU/5jSZY9RYv0g/TL8jBgdzS7Dx4CX4atTY8GxSkzkylnpa+3ecW1SN7cd02H7sCuoMJvz1scE3nOBZbzShoKIOkf6eN/36Qgh8eeAiXtts7sIF+3hgZHwI9pwulIMwcC3EPHZnrM3nU+w5XYh56w+hsLIOXu5uiA7ykoenLH4xJAop0/rfMBSU1dTjp7OFSDtViB9OXZU7JEk9gjErqRvG9Am7pYnkQgi8+91prNhxWt6mVkkY3j0IY/qE4b6EcJtOrv3P4ct44Z+Z8nOtpxpvTk3AlIFd5W1VdQb85/Bl7Dh+BT27aDErKfaWarhaUYc3txyTQ1qIrwZFVXUQArgrPgQfzBzc4jy7tmKYaYRhhqhthBBIO3UV/9iTg5O6Crw2uR8mDYho+YUKeu3ro1jz0wW4qSQYTQK+GjW+mXuXS6zdU1tvxEmducNTbzTBYBSo1huwLfsKtmTlQ2+41vlxU0n4+MlhN7xazVmcKajA82szrbo/Wo0ad/UKwejbuuD+ARGKXOFjcbWiDi+tP4Q9ZwoBmOcL9Y3ww8ieIRjVKxRJjeaKtEQIgfNF1VCrpHZ3Gb7NysfOkwUY0TMEo3t1gb+3Mh/2APDxnhy8seUY7owLxjsP397sMK4tpJ26it9typKH52YMj8HrD/RT9JcIhplGGGaIOo7aeiOmfvCj/OH57qMDrX4LdVUlVXpszLyEf+7PxbmrlXhjagJmDo91dFmtUltvxKc/nUdJdT1GxYdiaLdART/grmcyCew4fgX1RoHEHsHyHJvOpLy2HlqNWvHh8hq9EZ/tO49gHw2mDe6q+PsxzDTCMEPUsZwpqMDTn2bgnt5d8Lv7+zq6HJsSQqBKb2z1JcNEHdmtfH7z/xgicik9u2jx/cujHV2GIiRJYpAhaoOOvx4yERERdWgMM0REROTSGGaIiIjIpTHMEBERkUtjmCEiIiKXxjBDRERELo1hhoiIiFwawwwRERG5NIYZIiIicmkMM0REROTSXCLM/OUvf0FcXBw8PT0xZMgQ7N6929ElERERkZNw+jCzfv16zJs3D4sXL0ZmZibuuusuTJgwAbm5uY4ujYiIiJyA0981e/jw4Rg8eDBWrVolb+vTpw+mTp2K
lJSUFl/Pu2YTERG5nlv5/Hbqzoxer0dGRgbGjRtntX3cuHHYu3dvs6+pq6tDeXm51YOIiIg6Lqe+13xhYSGMRiPCwsKstoeFhUGn0zX7mpSUFLz++utNtjPUEBERuQ7L53ZrBpCcOsxYSJJk9VwI0WSbxaJFizB//nz5+aVLl9C3b19ER0crWiMRERHZXkVFBfz9/W96jFOHmZCQELi5uTXpwhQUFDTp1lhoNBpoNBr5ua+vL/Ly8qDVam8YgNqqvLwc0dHRyMvL43wchfFc2w/Ptf3wXNsPz7X92OpcCyFQUVGByMjIFo916jDj4eGBIUOGIDU1FQ8++KC8PTU1FVOmTGnV11CpVIiKilKqRACAn58f/+ewE55r++G5th+ea/vhubYfW5zrljoyFk4dZgBg/vz5ePzxxzF06FAkJibiww8/RG5uLmbPnu3o0oiIiMgJOH2YeeSRR1BUVIQ//OEPyM/PR0JCAr755hvExsY6ujQiIiJyAk4fZgBgzpw5mDNnjqPLaEKj0eC1116zmqNDyuC5th+ea/vhubYfnmv7ccS5dvpF84iIiIhuxqkXzSMiIiJqCcMMERERuTSGGSIiInJpDDNERETk0hhm2ugvf/kL4uLi4OnpiSFDhmD37t2OLsnlpaSkYNiwYdBqtejSpQumTp2KkydPWh0jhMCSJUsQGRkJLy8vjB49GtnZ2Q6quONISUmBJEmYN2+evI3n2nYuXbqExx57DMHBwfD29sbAgQORkZEh7+e5tg2DwYDf/e53iIuLg5eXF7p3744//OEPMJlM8jE8123zww8/YPLkyYiMjIQkSdi0aZPV/tac17q6OrzwwgsICQmBj48PHnjgAVy8eNE2BQq6ZevWrRPu7u7io48+EseOHRMvvvii8PHxERcuXHB0aS5t/PjxYvXq1eLo0aPi0KFDYtKkSSImJkZUVlbKx7z99ttCq9WKDRs2iKysLPHII4+IiIgIUV5e7sDKXdv+/ftFt27dxIABA8SLL74ob+e5to3i4mIRGxsrnnzySfHzzz+LnJwcsWPHDnHmzBn5GJ5r23jzzTdFcHCw+O9//ytycnLEl19+KXx9fcWKFSvkY3iu2+abb74RixcvFhs2bBAAxFdffWW1vzXndfbs2aJr164iNTVVHDx4UNx9993i9ttvFwaDod31Mcy0wR133CFmz55tta13797i1VdfdVBFHVNBQYEAINLS0oQQQphMJhEeHi7efvtt+Zja2lrh7+8v/vrXvzqqTJdWUVEh4uPjRWpqqkhOTpbDDM+17SxcuFCMHDnyhvt5rm1n0qRJ4le/+pXVtmnTponHHntMCMFzbSvXh5nWnNfS0lLh7u4u1q1bJx9z6dIloVKpxNatW9tdE4eZbpFer0dGRgbGjRtntX3cuHHYu3evg6rqmMrKygAAQUFBAICcnBzodDqrc6/RaJCcnMxz30bPPfccJk2ahDFjxlht57m2nc2bN2Po0KH4xS9+gS5dumDQoEH46KOP5P0817YzcuRIfPfddzh16hQA4PDhw9izZw8mTpwIgOdaKa05rxkZGaivr7c6JjIyEgkJCTY59y6xArAzKSwshNFobHLX7rCwsCZ396a2E0Jg/vz5GDlyJBISEgBAPr/NnfsLFy7YvUZXt27dOhw8eBDp6elN9vFc2865c+ewatUqzJ8/H7/97W+xf/9+zJ07FxqNBk888QTPtQ0tXLgQZWVl6N27N9zc3GA0GvHWW29h+vTpAPjvWimtOa86nQ4eHh4IDAxscowtPjsZZtpIkiSr50KIJtuo7Z5//nkcOXIEe/bsabKP57798vLy8OKLL2L79u3w9PS84XE81+1nMpkwdOhQLF26FAAwaNAgZGdnY9WqVXjiiSfk43iu22/9+vX4/PPPsXbtWvTr1w+HDh3CvHnzEBkZiVmzZsnH8Vwroy3n1VbnnsNMtygkJARubm5NkmRBQUGTVEpt88ILL2Dz5s3YuXMnoqKi5O3h
4eEAwHNvAxkZGSgoKMCQIUOgVquhVquRlpaG9957D2q1Wj6fPNftFxERgb59+1pt69OnD3JzcwHw37UtvfLKK3j11Vfx6KOPon///nj88cfx0ksvISUlBQDPtVJac17Dw8Oh1+tRUlJyw2Pag2HmFnl4eGDIkCFITU212p6amoqkpCQHVdUxCCHw/PPPY+PGjfj+++8RFxdntT8uLg7h4eFW516v1yMtLY3n/hbde++9yMrKwqFDh+TH0KFDMXPmTBw6dAjdu3fnubaRESNGNFli4NSpU4iNjQXAf9e2VF1dDZXK+mPNzc1NvjSb51oZrTmvQ4YMgbu7u9Ux+fn5OHr0qG3OfbunEHdClkuz//GPf4hjx46JefPmCR8fH3H+/HlHl+bSnn32WeHv7y927dol8vPz5Ud1dbV8zNtvvy38/f3Fxo0bRVZWlpg+fTovq7SRxlczCcFzbSv79+8XarVavPXWW+L06dPiiy++EN7e3uLzzz+Xj+G5to1Zs2aJrl27ypdmb9y4UYSEhIgFCxbIx/Bct01FRYXIzMwUmZmZAoBYvny5yMzMlJckac15nT17toiKihI7duwQBw8eFPfccw8vzXa0Dz74QMTGxgoPDw8xePBg+fJhajsAzT5Wr14tH2MymcRrr70mwsPDhUajEaNGjRJZWVmOK7oDuT7M8Fzbzn/+8x+RkJAgNBqN6N27t/jwww+t9vNc20Z5ebl48cUXRUxMjPD09BTdu3cXixcvFnV1dfIxPNdts3PnzmZ/Ps+aNUsI0brzWlNTI55//nkRFBQkvLy8xP333y9yc3NtUp8khBDt7+8QEREROQbnzBAREZFLY5ghIiIil8YwQ0RERC6NYYaIiIhcGsMMERERuTSGGSIiInJpDDNERETk0hhmiIiIyKUxzBCRzXTr1g0rVqxo9fG7du2CJEkoLS1VrCZncqvnh4haR+3oAojIcUaPHo2BAwfa7AM2PT0dPj4+rT4+KSkJ+fn58Pf3t8n7E1HnxDBDRDclhIDRaIRa3fKPi9DQ0Fv62h4eHggPD29raUREADjMRNRpPfnkk0hLS8O7774LSZIgSRLOnz8vD/1s27YNQ4cOhUajwe7du3H27FlMmTIFYWFh8PX1xbBhw7Bjxw6rr3n9MIokSfj73/+OBx98EN7e3oiPj8fmzZvl/dcPM33yyScICAjAtm3b0KdPH/j6+uK+++5Dfn6+/BqDwYC5c+ciICAAwcHBWLhwIWbNmoWpU6fe9Pvdu3cvRo0aBS8vL0RHR2Pu3Lmoqqqyqv2NN97AjBkz4Ovri8jISLz//vtWXyM3NxdTpkyBr68v/Pz88PDDD+PKlStWx2zevBlDhw6Fp6cnQkJCMG3aNKv91dXV+NWvfgWtVouYmBh8+OGHN62biFrGMEPUSb377rtITEzEU089hfz8fOTn5yM6Olrev2DBAqSkpOD48eMYMGAAKisrMXHiROzYsQOZmZkYP348Jk+ejNzc3Ju+z+uvv46HH34YR44cwcSJEzFz5kwUFxff8Pjq6mq88847+Oyzz/DDDz8gNzcXL7/8srz///7v//DFF19g9erV+PHHH1FeXo5NmzbdtIasrCyMHz8e06ZNw5EjR7B+/Xrs2bMHzz//vNVxf/zjHzFgwAAcPHgQixYtwksvvYTU1FQA5g7V1KlTUVxcjLS0NKSmpuLs2bN45JFH5Ndv2bIF06ZNw6RJk5CZmYnvvvsOQ4cOtXqPP/3pTxg6dCgyMzMxZ84cPPvsszhx4sRN6yeiFtjk3ttE5JKSk5PFiy++aLVt586dAoDYtGlTi6/v27eveP/99+XnsbGx4s9//rP8HID43e9+Jz+vrKwUkiSJb7/91uq9SkpKhBBCrF69WgAQZ86ckV/zwQcfiLCwMPl5WFiY+OMf/yg/NxgMIiYmRkyZMuWGdT7++OPi6aefttq2e/duoVKpRE1NjVz7fffdZ3XMI488IiZMmCCEEGL79u3Czc1N5Obm
yvuzs7MFALF//34hhBCJiYli5syZN6wjNjZWPPbYY/Jzk8kkunTpIlatWnXD1xBRy9iZIaJmXd9RqKqqwoIFC9C3b18EBATA19cXJ06caLEzM2DAAPnPPj4+0Gq1KCgouOHx3t7e6NGjh/w8IiJCPr6srAxXrlzBHXfcIe93c3PDkCFDblpDRkYGPvnkE/j6+sqP8ePHw2QyIScnRz4uMTHR6nWJiYk4fvw4AOD48eOIjo626l5ZzoXlmEOHDuHee++9aS2Nz4ckSQgPD7/p+SCilnECMBE16/qrkl555RVs27YN77zzDnr27AkvLy889NBD0Ov1N/067u7uVs8lSYLJZLql44UQTbY1dv3+65lMJjzzzDOYO3duk30xMTE3fa3lvYQQTd73+u1eXl43/VrArZ8PImoZOzNEnZiHhweMRmOrjt29ezeefPJJPPjgg+jfvz/Cw8Nx/vx5ZQu8jr+/P8LCwrB//355m9FoRGZm5k1fN3jwYGRnZ6Nnz55NHh4eHvJx+/bts3rdvn370Lt3bwDmLkxubi7y8vLk/ceOHUNZWRn69OkDwNx1+e6779r9fRLRrWFnhqgT69atG37++WecP38evr6+CAoKuuGxPXv2xMaNGzF58mRIkoTf//73DukovPDCC0hJSUHPnj3Ru3dvvP/++ygpKWm2a2KxcOFC3HnnnXjuuefw1FNPwcfHB8ePH0dqaqrVFUs//vgjli1bhqlTpyI1NRVffvkltmzZAgAYM2YMBgwYgJkzZ2LFihUwGAyYM2cOkpOT5SG51157Dffeey969OiBRx99FAaDAd9++y0WLFig7Ekh6uTYmSHqxF5++WW4ubmhb9++CA0Nven8lz//+c8IDAxEUlISJk+ejPHjx2Pw4MF2rNZs4cKFmD59Op544gkkJibK8188PT1v+JoBAwYgLS0Np0+fxl133YVBgwbh97//PSIiIqyO+81vfoOMjAwMGjQIb7zxBv70pz9h/PjxAMzDQZs2bUJgYCBGjRqFMWPGoHv37li/fr38+tGjR+PLL7/E5s2bMXDgQNxzzz34+eeflTkRRCSTREuDzURETsxkMqFPnz54+OGH8cYbb7T563Tr1g3z5s3DvHnzbFccEdkFh5mIyKVcuHAB27dvR3JyMurq6rBy5Urk5ORgxowZji6NiByEw0xE5FJUKhU++eQTDBs2DCNGjEBWVhZ27NghT8Ilos6Hw0xERETk0tiZISIiIpfGMENEREQujWGGiIiIXBrDDBEREbk0hhkiIiJyaQwzRERE5NIYZoiIiMilMcwQERGRS/v/FYgVBBiTCpwAAAAASUVORK5CYII=",
      "text/plain": [
       "<Figure size 640x480 with 1 Axes>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# 为对比学习负采样准备词频率分布\n",
    "vocab_size = len(dataset.token2id)\n",
    "embed_size = 128\n",
    "distribution = dataset.get_word_distribution()\n",
    "print(distribution)\n",
    "model = SkipGramNCE(vocab_size, embed_size, distribution)\n",
    "\n",
    "from torch.utils.data import DataLoader\n",
    "from torch.optim import SGD, Adam\n",
    "\n",
    "# 定义静态方法collate_batch批量处理数据，转化为PyTorch可以需要的张量类型\n",
    "class DataCollator:\n",
    "    @classmethod\n",
    "    def collate_batch(cls, batch):\n",
    "        batch = np.array(batch)\n",
    "        input_ids = torch.tensor(batch[:, 0], dtype=torch.long)\n",
    "        labels = torch.tensor(batch[:, 1], dtype=torch.long)\n",
    "        return {'input_ids': input_ids, 'labels': labels}\n",
    "\n",
    "# 定义训练参数以及训练循环\n",
    "epochs = 100\n",
    "batch_size = 128\n",
    "learning_rate = 1e-3\n",
    "epoch_loss = []\n",
    "\n",
    "data_collator = DataCollator()\n",
    "dataloader = DataLoader(data, batch_size=batch_size, shuffle=True,\\\n",
    "    collate_fn=data_collator.collate_batch)\n",
    "optimizer = Adam(model.parameters(), lr=learning_rate)\n",
    "model.zero_grad()\n",
    "model.train()\n",
    "\n",
    "# 需要提前安装tqdm\n",
    "from tqdm import trange\n",
    "import matplotlib.pyplot as plt\n",
    "\n",
    "# 训练过程，每步读取数据，送入模型计算损失，并使用PyTorch进行优化\n",
    "with trange(epochs, desc='epoch', ncols=60) as pbar:\n",
    "    for epoch in pbar:\n",
    "        for step, batch in enumerate(dataloader):\n",
    "            loss = model(**batch)\n",
    "            pbar.set_description(f'epoch-{epoch}, loss={loss.item():.4f}')\n",
    "            loss.backward()\n",
    "            optimizer.step()\n",
    "            model.zero_grad()\n",
    "        epoch_loss.append(loss.item())\n",
    "    \n",
    "epoch_loss = np.array(epoch_loss)\n",
    "plt.plot(range(len(epoch_loss)), epoch_loss)\n",
    "plt.xlabel('training epoch')\n",
    "plt.ylabel('loss')\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "c9430e9a",
   "metadata": {},
   "source": [
    "TF-IDF加权\n",
    "\n",
    "定义词频率（term frequency）。注意到不同长度的文章词频率会有较大差距，不利于比较和运算，因此可以对词频率取对数。\n",
    "\n",
    "$$\\text{tf}_{t,d} = \\log (\\text{count}(t,d) + 1)$$\n",
    "\n",
    "其中$\\text{count}(t,d)$表示词$t$在文档$d$中出现的次数，为了避免对0取对数，把所有的计数加1。\n",
    "\n",
    "那么如何区分高频词与低频词呢？TF-IDF引入了另一个重要的评价指标——文档频率（document frequency），即一个词在语料库所包含的多少篇文档中出现。在所有文档里出现的词往往是虚词或是常见实词，而只在少量文档里出现的词往往是具有明确含义的实词并且具有很强的文档区分度。用$\\text{df}_t$来表示在多少篇文档中出现了词$t$。\n",
    "\n",
    "为了压低高频词和提升低频词的影响，TF-IDF使用文档频率的倒数，也就是逆向文档频率（inverse document frequency）来对词频率进行加权。这很好理解，一个词的文档频率越高，其倒数就越小，权重就越小。\n",
    "\n",
    "$$\\text{idf}_t = \\log \\frac{N}{\\text{df}_t}$$\n",
    "\n",
    "其中$N$表示文档总数。为了避免分母为0，通常会将分母改为$\\text{df}_t+1$。\n",
    "\n",
    "基于词频率和逆向文档频率，得到TF-IDF的最终值为：\n",
    "\n",
    "$$w_{t,d} = \\text{tf}_{t,d} \\times \\text{idf}_{t}$$\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "f765e353",
   "metadata": {},
   "source": [
    "很多情况下会额外对文档的TF-IDF向量使用L2归一化，使得不同文档的TF-IDF向量具有相同的模长，便于相互比较。\n",
    "下面给出了TF-IDF的代码实现。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "9ce8e610",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-04-16T06:03:24.353311Z",
     "start_time": "2025-04-16T06:03:24.342709Z"
    }
   },
   "outputs": [],
   "source": [
    "class TFIDF:\n",
    "    def __init__(self, vocab_size, norm='l2', smooth_idf=True,\\\n",
    "                 sublinear_tf=True):\n",
    "        self.vocab_size = vocab_size\n",
    "        self.norm = norm\n",
    "        self.smooth_idf = smooth_idf\n",
    "        self.sublinear_tf = sublinear_tf\n",
    "    \n",
    "    def fit(self, X):\n",
    "        doc_freq = np.zeros(self.vocab_size, dtype=np.float64)\n",
    "        for data in X:\n",
    "            for token_id in set(data):\n",
    "                doc_freq[token_id] += 1\n",
    "        doc_freq += int(self.smooth_idf)\n",
    "        n_samples = len(X) + int(self.smooth_idf)\n",
    "        self.idf = np.log(n_samples / doc_freq) + 1\n",
    "    \n",
    "    def transform(self, X):\n",
    "        assert hasattr(self, 'idf')\n",
    "        term_freq = np.zeros((len(X), self.vocab_size), dtype=np.float64)\n",
    "        for i, data in enumerate(X):\n",
    "            for token in data:\n",
    "                term_freq[i, token] += 1\n",
    "        if self.sublinear_tf:\n",
    "            term_freq = np.log(term_freq + 1)\n",
    "        Y = term_freq * self.idf\n",
    "        if self.norm:\n",
    "            row_norm = (Y**2).sum(axis=1)\n",
    "            row_norm[row_norm == 0] = 1\n",
    "            Y /= np.sqrt(row_norm)[:, None]\n",
    "        return Y\n",
    "    \n",
    "    def fit_transform(self, X):\n",
    "        self.fit(X)\n",
    "        return self.transform(X)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9379c8b3-33b8-46af-a935-4f09eb35eb4d",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0ff5d7a9",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e8bba4f1",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "59c49432",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.4"
  },
  "toc": {
   "base_numbering": 1,
   "nav_menu": {},
   "number_sections": false,
   "sideBar": false,
   "skip_h1_title": false,
   "title_cell": "Table of Contents",
   "title_sidebar": "Contents",
   "toc_cell": true,
   "toc_position": {},
   "toc_section_display": true,
   "toc_window_display": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
